#ifndef CUFFTDX_FFT_32_FP64_INV_PTX_HPP
#define CUFFTDX_FFT_32_FP64_INV_PTX_HPP



// Specialization 607 of cufftdx_private_function for double precision, implemented
// as one inline-PTX statement. (The include guard names this file FFT_32_FP64_INV,
// so this is presumably one variant of a 32-point inverse FFT - TODO confirm.)
//
// rmem : eight complex<double> values per thread; rmem[0..7].x/.y are passed both
//        as inputs and as outputs, so the array is overwritten with the result.
// smem : 32-bit value used as the base address of all st.shared/ld.shared accesses.
//
// Operand map: %0-%15 = outputs rmem[0..7].x/.y, %16 = smem, %17 = lut_dp_8_32,
// %18 and up = input copies of rmem (some .y components are listed twice, which is
// why the input numbering is not a simple 2*k pattern).
//
// Outline of the PTX:
//  1. Add/sub butterfly network over the eight inputs, using the constant
//     0d3FE6A09E667F3BCD (0.70710678..., sqrt(2)/2) and its negation.
//  2. Two complex entries are loaded from lut_dp_8_32 at byte offset
//     16*(tid.x & 3) and at +64. Further factors are derived by repeated complex
//     multiplication with the first entry (square, cube, fourth power, and the
//     first/squared entry times the second entry). Seven of the eight butterfly
//     results are multiplied by the conjugate of these factors (re*wr + im*wi,
//     im*wr - re*wi); the first result (fd74/fd75) is left unscaled.
//  3. Exchange through shared memory in two phases: first the real (.x-derived)
//     components are stored as four v2 pairs at r12 and read back as eight scalars
//     at a 32-byte stride from r13, then the same is done for the imaginary
//     components. The asm contains four `barrier.sync 0` instructions.
//  4. A final add/sub network (no multiplications) writes %0-%15.
//
// Shared-memory addressing:
//   r4  = smem + (tid.y << 8)
//   r10 = r4  + ((tid.x << 6) & -256)
//   r12 = r10 + ((tid.x << 6) & 192)      (store address)
//   r13 = r12 - 56 * (tid.x & 3)          (load address)
// NOTE(review): the tid.y term and the (tid.x << 6) & -256 term both advance in
// 256-byte steps, so this presumably assumes blockDim.x == 4 - TODO confirm.
// NOTE(review): the asm statement has no clobber list (in particular no "memory"
// clobber) although it stores to and loads from shared memory - confirm callers do
// not rely on the compiler ordering ordinary memory accesses around it.
template<> __forceinline__ __device__ void cufftdx_private_function<607, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

// Single generated PTX block; register names (r*, fd*, rd*) are local to the braces.
asm volatile (R"({
.reg .b32 r<14>;
.reg .f64 fd<206>;
.reg .b64 rd<6>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 8;
mov.u32 r3, %16;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f64 fd33, %18, %28;
add.f64 fd34, %19, %30;
sub.f64 fd35, %18, %28;
sub.f64 fd36, %19, %30;
add.f64 fd37, %23, %34;
add.f64 fd38, %25, %35;
sub.f64 fd39, %23, %34;
sub.f64 fd40, %25, %35;
add.f64 fd41, fd33, fd37;
add.f64 fd42, fd34, fd38;
sub.f64 fd43, fd33, fd37;
sub.f64 fd44, fd34, fd38;
sub.f64 fd45, fd35, fd40;
add.f64 fd46, fd36, fd39;
add.f64 fd47, fd35, fd40;
sub.f64 fd48, fd36, fd39;
add.f64 fd49, %20, %31;
add.f64 fd50, %22, %33;
sub.f64 fd51, %20, %31;
sub.f64 fd52, %22, %33;
add.f64 fd53, %26, %36;
add.f64 fd54, %27, %37;
sub.f64 fd55, %26, %36;
sub.f64 fd56, %27, %37;
add.f64 fd57, fd49, fd53;
add.f64 fd58, fd50, fd54;
sub.f64 fd59, fd49, fd53;
sub.f64 fd60, fd50, fd54;
sub.f64 fd61, fd51, fd56;
add.f64 fd62, fd52, fd55;
add.f64 fd63, fd51, fd56;
sub.f64 fd64, fd52, fd55;
mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD;
mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD;
sub.f64 fd67, fd65, fd66;
add.f64 fd68, fd65, fd66;
mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD;
mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD;
sub.f64 fd71, fd69, fd70;
mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72;
add.f64 fd74, fd41, fd57;
add.f64 fd75, fd42, fd58;
sub.f64 fd76, fd41, fd57;
sub.f64 fd77, fd42, fd58;
add.f64 fd78, fd45, fd67;
add.f64 fd79, fd46, fd68;
sub.f64 fd80, fd45, fd67;
sub.f64 fd81, fd46, fd68;
sub.f64 fd82, fd43, fd60;
add.f64 fd83, fd44, fd59;
add.f64 fd84, fd43, fd60;
sub.f64 fd85, fd44, fd59;
add.f64 fd86, fd47, fd71;
add.f64 fd87, fd48, fd73;
sub.f64 fd88, fd47, fd71;
sub.f64 fd89, fd48, fd73;
and.b32 r6, r5, 3;
shl.b32 r7, r5, 4;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 48;
mov.u64 rd4, %17;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd90, fd91}, [rd5];
mul.f64 fd94, fd79, fd91;
fma.rn.f64 fd95, fd90, fd78, fd94;
mul.f64 fd96, fd78, fd91;
mul.f64 fd97, fd90, fd79;
sub.f64 fd98, fd97, fd96;
mul.f64 fd99, fd90, fd90;
mul.f64 fd100, fd91, fd91;
sub.f64 fd101, fd99, fd100;
mul.f64 fd102, fd91, fd90;
fma.rn.f64 fd103, fd91, fd90, fd102;
mul.f64 fd104, fd83, fd103;
fma.rn.f64 fd105, fd101, fd82, fd104;
mul.f64 fd106, fd82, fd103;
mul.f64 fd107, fd101, fd83;
sub.f64 fd108, fd107, fd106;
mul.f64 fd109, fd90, fd101;
mul.f64 fd110, fd91, fd103;
sub.f64 fd111, fd109, fd110;
mul.f64 fd112, fd90, fd103;
fma.rn.f64 fd113, fd91, fd101, fd112;
mul.f64 fd114, fd87, fd113;
fma.rn.f64 fd115, fd111, fd86, fd114;
mul.f64 fd116, fd86, fd113;
mul.f64 fd117, fd111, fd87;
sub.f64 fd118, fd117, fd116;
mul.f64 fd119, fd90, fd111;
mul.f64 fd120, fd91, fd113;
sub.f64 fd121, fd119, fd120;
mul.f64 fd122, fd90, fd113;
fma.rn.f64 fd123, fd91, fd111, fd122;
mul.f64 fd124, fd77, fd123;
fma.rn.f64 fd125, fd121, fd76, fd124;
mul.f64 fd126, fd76, fd123;
mul.f64 fd127, fd121, fd77;
sub.f64 fd128, fd127, fd126;
ld.global.v2.f64 {fd129, fd130}, [rd5+64];
mul.f64 fd133, fd81, fd130;
fma.rn.f64 fd134, fd129, fd80, fd133;
mul.f64 fd135, fd80, fd130;
mul.f64 fd136, fd129, fd81;
sub.f64 fd137, fd136, fd135;
mul.f64 fd138, fd90, fd129;
mul.f64 fd139, fd91, fd130;
sub.f64 fd140, fd138, fd139;
mul.f64 fd141, fd90, fd130;
fma.rn.f64 fd142, fd91, fd129, fd141;
mul.f64 fd143, fd85, fd142;
fma.rn.f64 fd144, fd140, fd84, fd143;
mul.f64 fd145, fd84, fd142;
mul.f64 fd146, fd140, fd85;
sub.f64 fd147, fd146, fd145;
mul.f64 fd148, fd90, fd140;
mul.f64 fd149, fd91, fd142;
sub.f64 fd150, fd148, fd149;
mul.f64 fd151, fd90, fd142;
fma.rn.f64 fd152, fd91, fd140, fd151;
mul.f64 fd153, fd89, fd152;
fma.rn.f64 fd154, fd150, fd88, fd153;
mul.f64 fd155, fd88, fd152;
mul.f64 fd156, fd150, fd89;
sub.f64 fd157, fd156, fd155;
shl.b32 r8, r5, 6;
and.b32 r9, r8, -256;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 192;
add.s32 r12, r10, r11;
st.shared.v2.f64 [r12], {fd74, fd95};
st.shared.v2.f64 [r12+16], {fd105, fd115};
st.shared.v2.f64 [r12+32], {fd125, fd134};
st.shared.v2.f64 [r12+48], {fd144, fd154};
barrier.sync 0;
mad.lo.s32 r13, r6, -56, r12;
ld.shared.f64 fd158, [r13];
ld.shared.f64 fd159, [r13+32];
ld.shared.f64 fd160, [r13+64];
ld.shared.f64 fd161, [r13+96];
ld.shared.f64 fd162, [r13+128];
ld.shared.f64 fd163, [r13+160];
ld.shared.f64 fd164, [r13+192];
ld.shared.f64 fd165, [r13+224];
barrier.sync 0;
st.shared.v2.f64 [r12], {fd75, fd98};
st.shared.v2.f64 [r12+16], {fd108, fd118};
st.shared.v2.f64 [r12+32], {fd128, fd137};
st.shared.v2.f64 [r12+48], {fd147, fd157};
barrier.sync 0;
ld.shared.f64 fd166, [r13];
ld.shared.f64 fd167, [r13+32];
ld.shared.f64 fd168, [r13+64];
ld.shared.f64 fd169, [r13+96];
ld.shared.f64 fd170, [r13+128];
ld.shared.f64 fd171, [r13+160];
ld.shared.f64 fd172, [r13+192];
ld.shared.f64 fd173, [r13+224];
add.f64 fd174, fd158, fd162;
add.f64 fd175, fd166, fd170;
sub.f64 fd176, fd158, fd162;
sub.f64 fd177, fd166, fd170;
add.f64 fd178, fd160, fd164;
add.f64 fd179, fd168, fd172;
sub.f64 fd180, fd160, fd164;
sub.f64 fd181, fd168, fd172;
add.f64 fd182, fd159, fd163;
add.f64 fd183, fd167, fd171;
sub.f64 fd184, fd159, fd163;
sub.f64 fd185, fd167, fd171;
add.f64 fd186, fd161, fd165;
add.f64 fd187, fd169, fd173;
sub.f64 fd188, fd161, fd165;
sub.f64 fd189, fd169, fd173;
add.f64 %0, fd174, fd178;
add.f64 %1, fd175, fd179;
add.f64 %2, fd182, fd186;
add.f64 %3, fd183, fd187;
add.f64 %5, fd177, fd180;
sub.f64 %4, fd176, fd181;
add.f64 %7, fd185, fd188;
sub.f64 %6, fd184, fd189;
sub.f64 %8, fd174, fd178;
sub.f64 %9, fd175, fd179;
sub.f64 %10, fd182, fd186;
sub.f64 %11, fd183, fd187;
sub.f64 %13, fd177, fd180;
add.f64 %12, fd176, fd181;
sub.f64 %15, fd185, fd188;
add.f64 %14, fd184, fd189;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y));
};




// Specialization 608 of cufftdx_private_function for double precision, implemented
// as one inline-PTX statement. (The include guard names this file FFT_32_FP64_INV,
// so this is presumably one variant of a 32-point inverse FFT - TODO confirm.)
//
// rmem : four complex<double> values per thread; rmem[0..3].x/.y are passed both
//        as inputs and as outputs, so the array is overwritten with the result.
// smem : 32-bit value used as the base address of all st.shared/ld.shared accesses.
//
// Operand map: %0-%7 = outputs rmem[0..3].x/.y, %8 = smem, %9 = lut_dp_4_32,
// %10 = lut_dp_4_8, %11 and up = input copies of rmem (some .y components are
// listed twice, so the input numbering is not a simple 2*k pattern).
//
// Outline of the PTX:
//  1. Add/sub butterfly over the four inputs.
//  2. Two complex entries are loaded from lut_dp_4_32 at byte offset
//     16*(tid.x & 7) and at +128; the square of the first entry is computed in
//     registers. Three of the four results are multiplied by the conjugate of
//     these factors; the sum term (fd54/fd53) is stored unscaled.
//  3. First exchange: four complex v2 stores at r12 (16-byte stride), then four
//     complex v2 loads from r13 (128-byte stride), bracketed by barrier.sync 0.
//  4. Second add/sub butterfly, scaled the same way with entries loaded from
//     lut_dp_4_8 at byte offset 16*((tid.x >> 2) & 1) and at +32.
//  5. Second exchange: four complex v2 stores at r19 (64-byte stride), then four
//     complex v2 loads from r20 (128-byte stride), again bracketed by barriers.
//  6. Final pairwise add/sub writes %0-%7.
// The asm contains four `barrier.sync 0` instructions in total.
//
// Shared-memory addressing:
//   r4  = smem + (tid.y << 9)
//   r9  = r4  + ((tid.x << 6) & -512)
//   r12 = r9  + ((tid.x << 6) & 448),  r13 = r12 - 48 * (tid.x & 7)
//   r17 = r9  + ((tid.x << 4) & 48)
//   r19 = r17 + ((tid.x << 6) & 256),  r20 = r19 - 48 * (tid.x & 4)
// NOTE(review): the tid.y term and the (tid.x << 6) & -512 term both advance in
// 512-byte steps, so this presumably assumes blockDim.x == 8 - TODO confirm.
// NOTE(review): the asm statement has no clobber list (in particular no "memory"
// clobber) although it stores to and loads from shared memory - confirm callers do
// not rely on the compiler ordering ordinary memory accesses around it.
template<> __forceinline__ __device__ void cufftdx_private_function<608, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

// Single generated PTX block; register names (r*, fd*, rd*) are local to the braces.
asm volatile (R"({
.reg .b32 r<21>;
.reg .f64 fd<145>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 9;
mov.u32 r3, %8;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f64 fd17, %11, %16;
add.f64 fd18, %12, %18;
sub.f64 fd19, %11, %16;
sub.f64 fd20, %12, %18;
add.f64 fd21, %13, %19;
add.f64 fd22, %15, %20;
sub.f64 fd23, %13, %19;
sub.f64 fd24, %15, %20;
sub.f64 fd25, fd17, fd21;
sub.f64 fd26, fd18, fd22;
sub.f64 fd27, fd19, fd24;
add.f64 fd28, fd20, fd23;
add.f64 fd29, fd19, fd24;
sub.f64 fd30, fd20, fd23;
and.b32 r6, r5, 7;
shl.b32 r7, r5, 6;
and.b32 r8, r7, -512;
add.s32 r9, r4, r8;
shl.b32 r10, r5, 4;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 112;
mov.u64 rd4, %9;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd31, fd32}, [rd5];
mul.f64 fd35, fd28, fd32;
mul.f64 fd36, fd27, fd32;
mul.f64 fd37, fd31, fd28;
mul.f64 fd38, fd31, fd31;
mul.f64 fd39, fd32, fd32;
sub.f64 fd40, fd38, fd39;
mul.f64 fd41, fd32, fd31;
fma.rn.f64 fd42, fd32, fd31, fd41;
mul.f64 fd43, fd26, fd42;
mul.f64 fd44, fd25, fd42;
mul.f64 fd45, fd40, fd26;
ld.global.v2.f64 {fd46, fd47}, [rd5+128];
mul.f64 fd50, fd30, fd47;
mul.f64 fd51, fd29, fd47;
mul.f64 fd52, fd46, fd30;
barrier.sync 0;
and.b32 r11, r7, 448;
add.s32 r12, r9, r11;
add.f64 fd53, fd18, fd22;
add.f64 fd54, fd17, fd21;
st.shared.v2.f64 [r12], {fd54, fd53};
fma.rn.f64 fd55, fd31, fd27, fd35;
sub.f64 fd56, fd37, fd36;
st.shared.v2.f64 [r12+16], {fd55, fd56};
sub.f64 fd57, fd45, fd44;
fma.rn.f64 fd58, fd40, fd25, fd43;
st.shared.v2.f64 [r12+32], {fd58, fd57};
fma.rn.f64 fd59, fd46, fd29, fd50;
sub.f64 fd60, fd52, fd51;
st.shared.v2.f64 [r12+48], {fd59, fd60};
barrier.sync 0;
mad.lo.s32 r13, r6, -48, r12;
ld.shared.v2.f64 {fd61, fd62}, [r13];
ld.shared.v2.f64 {fd65, fd66}, [r13+128];
ld.shared.v2.f64 {fd69, fd70}, [r13+256];
ld.shared.v2.f64 {fd73, fd74}, [r13+384];
add.f64 fd77, fd61, fd69;
add.f64 fd78, fd62, fd70;
sub.f64 fd79, fd61, fd69;
sub.f64 fd80, fd62, fd70;
add.f64 fd81, fd65, fd73;
add.f64 fd82, fd66, fd74;
sub.f64 fd83, fd65, fd73;
sub.f64 fd84, fd66, fd74;
sub.f64 fd85, fd77, fd81;
sub.f64 fd86, fd78, fd82;
sub.f64 fd87, fd79, fd84;
add.f64 fd88, fd80, fd83;
add.f64 fd89, fd79, fd84;
sub.f64 fd90, fd80, fd83;
and.b32 r14, r5, 4;
bfe.u32 r15, r5, 2, 1;
mul.wide.u32 rd6, r15, 16;
mov.u64 rd7, %10;
add.s64 rd8, rd7, rd6;
ld.global.v2.f64 {fd91, fd92}, [rd8];
mul.f64 fd95, fd88, fd92;
mul.f64 fd96, fd87, fd92;
mul.f64 fd97, fd91, fd88;
mul.f64 fd98, fd91, fd91;
mul.f64 fd99, fd92, fd92;
sub.f64 fd100, fd98, fd99;
mul.f64 fd101, fd92, fd91;
fma.rn.f64 fd102, fd92, fd91, fd101;
mul.f64 fd103, fd86, fd102;
mul.f64 fd104, fd85, fd102;
mul.f64 fd105, fd100, fd86;
ld.global.v2.f64 {fd106, fd107}, [rd8+32];
mul.f64 fd110, fd90, fd107;
mul.f64 fd111, fd89, fd107;
mul.f64 fd112, fd106, fd90;
and.b32 r16, r10, 48;
add.s32 r17, r9, r16;
barrier.sync 0;
and.b32 r18, r7, 256;
add.s32 r19, r17, r18;
add.f64 fd113, fd78, fd82;
add.f64 fd114, fd77, fd81;
st.shared.v2.f64 [r19], {fd114, fd113};
fma.rn.f64 fd115, fd91, fd87, fd95;
sub.f64 fd116, fd97, fd96;
st.shared.v2.f64 [r19+64], {fd115, fd116};
fma.rn.f64 fd117, fd100, fd85, fd103;
sub.f64 fd118, fd105, fd104;
st.shared.v2.f64 [r19+128], {fd117, fd118};
fma.rn.f64 fd119, fd106, fd89, fd110;
sub.f64 fd120, fd112, fd111;
st.shared.v2.f64 [r19+192], {fd119, fd120};
barrier.sync 0;
mad.lo.s32 r20, r14, -48, r19;
ld.shared.v2.f64 {fd121, fd122}, [r20];
ld.shared.v2.f64 {fd125, fd126}, [r20+128];
ld.shared.v2.f64 {fd129, fd130}, [r20+256];
ld.shared.v2.f64 {fd133, fd134}, [r20+384];
add.f64 %1, fd122, fd130;
add.f64 %0, fd121, fd129;
add.f64 %3, fd126, fd134;
add.f64 %2, fd125, fd133;
sub.f64 %5, fd122, fd130;
sub.f64 %4, fd121, fd129;
sub.f64 %7, fd126, fd134;
sub.f64 %6, fd125, fd133;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y));
};




// Specialization 609 of cufftdx_private_function for double precision, implemented
// as one inline-PTX statement. (The include guard names this file FFT_32_FP64_INV,
// so this is presumably one variant of a 32-point inverse FFT - TODO confirm.)
//
// rmem : eight complex<double> values per thread; rmem[0..7].x/.y are passed both
//        as inputs and as outputs, so the array is overwritten with the result.
// smem : 32-bit value used as the base address of all st.shared/ld.shared accesses.
//
// Operand map: %0-%15 = outputs rmem[0..7].x/.y, %16 = smem, %17 = lut_dp_8_32,
// %18 and up = input copies of rmem (some .y components are listed twice, so the
// input numbering is not a simple 2*k pattern).
//
// Outline of the PTX:
//  1. Add/sub butterfly network over the eight inputs, using the constant
//     0d3FE6A09E667F3BCD (0.70710678..., sqrt(2)/2) and its negation.
//  2. Two complex entries are loaded from lut_dp_8_32 at byte offset
//     16*(tid.x & 3) and at +64. Further factors are derived by repeated complex
//     multiplication with the first entry (square, cube, fourth power, and the
//     first/squared entry times the second entry). Seven of the eight butterfly
//     results are multiplied by the conjugate of these factors; the sum term
//     (fd143/fd142) is stored unscaled.
//  3. One exchange through shared memory with real and imaginary parts kept
//     together: eight complex v2 stores at r12 (16-byte stride), then eight
//     complex v2 loads from r13 (64-byte stride). The asm contains two
//     `barrier.sync 0` instructions, one before the stores and one before the loads.
//  4. A final add/sub network (no multiplications) writes %0-%15.
//
// Shared-memory addressing:
//   r4  = smem + (tid.y << 9)
//   r9  = r4  + ((tid.x << 7) & -512)
//   r12 = r9  + ((tid.x << 7) & 384)      (store address)
//   r13 = r12 - 112 * (tid.x & 3)         (load address)
// NOTE(review): the tid.y term and the (tid.x << 7) & -512 term both advance in
// 512-byte steps, so this presumably assumes blockDim.x == 4 - TODO confirm.
// NOTE(review): the asm statement has no clobber list (in particular no "memory"
// clobber) although it stores to and loads from shared memory - confirm callers do
// not rely on the compiler ordering ordinary memory accesses around it.
template<> __forceinline__ __device__ void cufftdx_private_function<609, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

// Single generated PTX block; register names (r*, fd*, rd*) are local to the braces.
asm volatile (R"({
.reg .b32 r<14>;
.reg .f64 fd<222>;
.reg .b64 rd<6>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 9;
mov.u32 r3, %16;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f64 fd33, %18, %28;
add.f64 fd34, %19, %30;
sub.f64 fd35, %18, %28;
sub.f64 fd36, %19, %30;
add.f64 fd37, %23, %34;
add.f64 fd38, %25, %35;
sub.f64 fd39, %23, %34;
sub.f64 fd40, %25, %35;
add.f64 fd41, fd33, fd37;
add.f64 fd42, fd34, fd38;
sub.f64 fd43, fd33, fd37;
sub.f64 fd44, fd34, fd38;
sub.f64 fd45, fd35, fd40;
add.f64 fd46, fd36, fd39;
add.f64 fd47, fd35, fd40;
sub.f64 fd48, fd36, fd39;
add.f64 fd49, %20, %31;
add.f64 fd50, %22, %33;
sub.f64 fd51, %20, %31;
sub.f64 fd52, %22, %33;
add.f64 fd53, %26, %36;
add.f64 fd54, %27, %37;
sub.f64 fd55, %26, %36;
sub.f64 fd56, %27, %37;
add.f64 fd57, fd49, fd53;
add.f64 fd58, fd50, fd54;
sub.f64 fd59, fd49, fd53;
sub.f64 fd60, fd50, fd54;
sub.f64 fd61, fd51, fd56;
add.f64 fd62, fd52, fd55;
add.f64 fd63, fd51, fd56;
sub.f64 fd64, fd52, fd55;
mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD;
mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD;
sub.f64 fd67, fd65, fd66;
add.f64 fd68, fd65, fd66;
mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD;
mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD;
sub.f64 fd71, fd69, fd70;
mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72;
sub.f64 fd74, fd41, fd57;
sub.f64 fd75, fd42, fd58;
add.f64 fd76, fd45, fd67;
add.f64 fd77, fd46, fd68;
sub.f64 fd78, fd45, fd67;
sub.f64 fd79, fd46, fd68;
sub.f64 fd80, fd43, fd60;
add.f64 fd81, fd44, fd59;
add.f64 fd82, fd43, fd60;
sub.f64 fd83, fd44, fd59;
add.f64 fd84, fd47, fd71;
add.f64 fd85, fd48, fd73;
sub.f64 fd86, fd47, fd71;
sub.f64 fd87, fd48, fd73;
and.b32 r6, r5, 3;
shl.b32 r7, r5, 7;
and.b32 r8, r7, -512;
add.s32 r9, r4, r8;
shl.b32 r10, r5, 4;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 48;
mov.u64 rd4, %17;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd88, fd89}, [rd5];
mul.f64 fd92, fd77, fd89;
mul.f64 fd93, fd76, fd89;
mul.f64 fd94, fd88, fd77;
mul.f64 fd95, fd88, fd88;
mul.f64 fd96, fd89, fd89;
sub.f64 fd97, fd95, fd96;
mul.f64 fd98, fd89, fd88;
fma.rn.f64 fd99, fd89, fd88, fd98;
mul.f64 fd100, fd81, fd99;
mul.f64 fd101, fd80, fd99;
mul.f64 fd102, fd97, fd81;
mul.f64 fd103, fd88, fd97;
mul.f64 fd104, fd89, fd99;
sub.f64 fd105, fd103, fd104;
mul.f64 fd106, fd88, fd99;
fma.rn.f64 fd107, fd89, fd97, fd106;
mul.f64 fd108, fd85, fd107;
mul.f64 fd109, fd84, fd107;
mul.f64 fd110, fd105, fd85;
mul.f64 fd111, fd88, fd105;
mul.f64 fd112, fd89, fd107;
sub.f64 fd113, fd111, fd112;
mul.f64 fd114, fd88, fd107;
fma.rn.f64 fd115, fd89, fd105, fd114;
mul.f64 fd116, fd75, fd115;
mul.f64 fd117, fd74, fd115;
mul.f64 fd118, fd113, fd75;
ld.global.v2.f64 {fd119, fd120}, [rd5+64];
mul.f64 fd123, fd79, fd120;
mul.f64 fd124, fd78, fd120;
mul.f64 fd125, fd119, fd79;
mul.f64 fd126, fd88, fd119;
mul.f64 fd127, fd89, fd120;
sub.f64 fd128, fd126, fd127;
mul.f64 fd129, fd88, fd120;
fma.rn.f64 fd130, fd89, fd119, fd129;
mul.f64 fd131, fd83, fd130;
mul.f64 fd132, fd82, fd130;
mul.f64 fd133, fd128, fd83;
mul.f64 fd134, fd88, fd128;
mul.f64 fd135, fd89, fd130;
sub.f64 fd136, fd134, fd135;
mul.f64 fd137, fd88, fd130;
fma.rn.f64 fd138, fd89, fd128, fd137;
mul.f64 fd139, fd87, fd138;
mul.f64 fd140, fd86, fd138;
mul.f64 fd141, fd136, fd87;
barrier.sync 0;
and.b32 r11, r7, 384;
add.s32 r12, r9, r11;
add.f64 fd142, fd42, fd58;
add.f64 fd143, fd41, fd57;
st.shared.v2.f64 [r12], {fd143, fd142};
fma.rn.f64 fd144, fd88, fd76, fd92;
sub.f64 fd145, fd94, fd93;
st.shared.v2.f64 [r12+16], {fd144, fd145};
fma.rn.f64 fd146, fd97, fd80, fd100;
sub.f64 fd147, fd102, fd101;
st.shared.v2.f64 [r12+32], {fd146, fd147};
sub.f64 fd148, fd110, fd109;
fma.rn.f64 fd149, fd105, fd84, fd108;
st.shared.v2.f64 [r12+48], {fd149, fd148};
fma.rn.f64 fd150, fd113, fd74, fd116;
sub.f64 fd151, fd118, fd117;
st.shared.v2.f64 [r12+64], {fd150, fd151};
fma.rn.f64 fd152, fd119, fd78, fd123;
sub.f64 fd153, fd125, fd124;
st.shared.v2.f64 [r12+80], {fd152, fd153};
fma.rn.f64 fd154, fd128, fd82, fd131;
sub.f64 fd155, fd133, fd132;
st.shared.v2.f64 [r12+96], {fd154, fd155};
sub.f64 fd156, fd141, fd140;
fma.rn.f64 fd157, fd136, fd86, fd139;
st.shared.v2.f64 [r12+112], {fd157, fd156};
barrier.sync 0;
mad.lo.s32 r13, r6, -112, r12;
ld.shared.v2.f64 {fd158, fd159}, [r13];
ld.shared.v2.f64 {fd162, fd163}, [r13+64];
ld.shared.v2.f64 {fd166, fd167}, [r13+128];
ld.shared.v2.f64 {fd170, fd171}, [r13+192];
ld.shared.v2.f64 {fd174, fd175}, [r13+256];
ld.shared.v2.f64 {fd178, fd179}, [r13+320];
ld.shared.v2.f64 {fd182, fd183}, [r13+384];
ld.shared.v2.f64 {fd186, fd187}, [r13+448];
add.f64 fd190, fd158, fd174;
add.f64 fd191, fd159, fd175;
sub.f64 fd192, fd158, fd174;
sub.f64 fd193, fd159, fd175;
add.f64 fd194, fd166, fd182;
add.f64 fd195, fd167, fd183;
sub.f64 fd196, fd166, fd182;
sub.f64 fd197, fd167, fd183;
add.f64 fd198, fd162, fd178;
add.f64 fd199, fd163, fd179;
sub.f64 fd200, fd162, fd178;
sub.f64 fd201, fd163, fd179;
add.f64 fd202, fd170, fd186;
add.f64 fd203, fd171, fd187;
sub.f64 fd204, fd170, fd186;
sub.f64 fd205, fd171, fd187;
add.f64 %1, fd191, fd195;
add.f64 %0, fd190, fd194;
add.f64 %3, fd199, fd203;
add.f64 %2, fd198, fd202;
add.f64 %5, fd193, fd196;
sub.f64 %4, fd192, fd197;
add.f64 %7, fd201, fd204;
sub.f64 %6, fd200, fd205;
sub.f64 %9, fd191, fd195;
sub.f64 %8, fd190, fd194;
sub.f64 %11, fd199, fd203;
sub.f64 %10, fd198, fd202;
sub.f64 %13, fd193, fd196;
add.f64 %12, fd192, fd197;
sub.f64 %15, fd201, fd204;
add.f64 %14, fd200, fd205;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y));
};




// Specialization 610 of cufftdx_private_function for double precision, implemented
// as one inline-PTX statement. (The include guard names this file FFT_32_FP64_INV,
// so this is presumably one variant of a 32-point inverse FFT - TODO confirm.)
//
// rmem : four complex<double> values per thread; rmem[0..3].x/.y are passed both
//        as inputs and as outputs, so the array is overwritten with the result.
// smem : 32-bit value used as the base address of all st.shared/ld.shared accesses.
//
// Operand map: %0-%7 = outputs rmem[0..3].x/.y, %8 = smem, %9 = lut_dp_4_32,
// %10 = lut_dp_4_8, %11 and up = input copies of rmem (some .y components are
// listed twice, so the input numbering is not a simple 2*k pattern).
//
// Outline of the PTX:
//  1. Add/sub butterfly over the four inputs.
//  2. Two complex entries are loaded from lut_dp_4_32 at byte offset
//     16*(tid.x & 7) and at +128; the square of the first entry is computed in
//     registers. Three of the four results are multiplied by the conjugate of
//     these factors; the first result (fd25/fd26) is left unscaled.
//  3. First exchange, split by component: the four real parts are written as two
//     v2 stores at r12 and read back as four scalars from r13 (64-byte stride);
//     after another barrier the four imaginary parts reuse the same addresses.
//  4. Second add/sub butterfly, scaled the same way with entries loaded from
//     lut_dp_4_8 at byte offset 16*((tid.x >> 2) & 1) and at +32.
//  5. Second exchange, again split by component: scalar stores at r20 (32-byte
//     stride) and scalar loads from r21 (64-byte stride), real parts first.
//  6. Final pairwise add/sub writes %0-%7.
// The asm contains eight `barrier.sync 0` instructions in total.
//
// Shared-memory addressing:
//   r4  = smem + (tid.y << 8)
//   r10 = r4  + ((tid.x << 5) & -256)
//   r12 = r10 + ((tid.x << 5) & 224),  r13 = r12 - 24 * (tid.x & 7)
//   r18 = r10 + ((tid.x << 3) & 24)
//   r20 = r18 + ((tid.x << 5) & 128),  r21 = r20 - 24 * (tid.x & 4)
// NOTE(review): the tid.y term and the (tid.x << 5) & -256 term both advance in
// 256-byte steps, so this presumably assumes blockDim.x == 8 - TODO confirm.
// NOTE(review): the asm statement has no clobber list (in particular no "memory"
// clobber) although it stores to and loads from shared memory - confirm callers do
// not rely on the compiler ordering ordinary memory accesses around it.
template<> __forceinline__ __device__ void cufftdx_private_function<610, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

// Single generated PTX block; register names (r*, fd*, rd*) are local to the braces.
asm volatile (R"({
.reg .b32 r<22>;
.reg .f64 fd<129>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 8;
mov.u32 r3, %8;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f64 fd17, %11, %16;
add.f64 fd18, %12, %18;
sub.f64 fd19, %11, %16;
sub.f64 fd20, %12, %18;
add.f64 fd21, %13, %19;
add.f64 fd22, %15, %20;
sub.f64 fd23, %13, %19;
sub.f64 fd24, %15, %20;
add.f64 fd25, fd17, fd21;
add.f64 fd26, fd18, fd22;
sub.f64 fd27, fd17, fd21;
sub.f64 fd28, fd18, fd22;
sub.f64 fd29, fd19, fd24;
add.f64 fd30, fd20, fd23;
add.f64 fd31, fd19, fd24;
sub.f64 fd32, fd20, fd23;
and.b32 r6, r5, 7;
shl.b32 r7, r5, 4;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 112;
mov.u64 rd4, %9;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd33, fd34}, [rd5];
mul.f64 fd37, fd30, fd34;
fma.rn.f64 fd38, fd33, fd29, fd37;
mul.f64 fd39, fd29, fd34;
mul.f64 fd40, fd33, fd30;
sub.f64 fd41, fd40, fd39;
mul.f64 fd42, fd33, fd33;
mul.f64 fd43, fd34, fd34;
sub.f64 fd44, fd42, fd43;
mul.f64 fd45, fd34, fd33;
fma.rn.f64 fd46, fd34, fd33, fd45;
mul.f64 fd47, fd28, fd46;
fma.rn.f64 fd48, fd44, fd27, fd47;
mul.f64 fd49, fd27, fd46;
mul.f64 fd50, fd44, fd28;
sub.f64 fd51, fd50, fd49;
ld.global.v2.f64 {fd52, fd53}, [rd5+128];
mul.f64 fd56, fd32, fd53;
fma.rn.f64 fd57, fd52, fd31, fd56;
mul.f64 fd58, fd31, fd53;
mul.f64 fd59, fd52, fd32;
sub.f64 fd60, fd59, fd58;
shl.b32 r8, r5, 5;
and.b32 r9, r8, -256;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 224;
add.s32 r12, r10, r11;
st.shared.v2.f64 [r12], {fd25, fd38};
st.shared.v2.f64 [r12+16], {fd48, fd57};
barrier.sync 0;
mad.lo.s32 r13, r6, -24, r12;
ld.shared.f64 fd61, [r13];
ld.shared.f64 fd62, [r13+64];
ld.shared.f64 fd63, [r13+128];
ld.shared.f64 fd64, [r13+192];
barrier.sync 0;
st.shared.v2.f64 [r12], {fd26, fd41};
st.shared.v2.f64 [r12+16], {fd51, fd60};
barrier.sync 0;
ld.shared.f64 fd65, [r13];
ld.shared.f64 fd66, [r13+64];
ld.shared.f64 fd67, [r13+128];
ld.shared.f64 fd68, [r13+192];
add.f64 fd69, fd61, fd63;
add.f64 fd70, fd65, fd67;
sub.f64 fd71, fd61, fd63;
sub.f64 fd72, fd65, fd67;
add.f64 fd73, fd62, fd64;
add.f64 fd74, fd66, fd68;
sub.f64 fd75, fd62, fd64;
sub.f64 fd76, fd66, fd68;
add.f64 fd77, fd69, fd73;
add.f64 fd78, fd70, fd74;
sub.f64 fd79, fd69, fd73;
sub.f64 fd80, fd70, fd74;
sub.f64 fd81, fd71, fd76;
add.f64 fd82, fd72, fd75;
add.f64 fd83, fd71, fd76;
sub.f64 fd84, fd72, fd75;
and.b32 r14, r5, 4;
bfe.u32 r15, r5, 2, 1;
mul.wide.u32 rd6, r15, 16;
mov.u64 rd7, %10;
add.s64 rd8, rd7, rd6;
ld.global.v2.f64 {fd85, fd86}, [rd8];
mul.f64 fd89, fd82, fd86;
fma.rn.f64 fd90, fd85, fd81, fd89;
mul.f64 fd91, fd81, fd86;
mul.f64 fd92, fd85, fd82;
sub.f64 fd93, fd92, fd91;
mul.f64 fd94, fd85, fd85;
mul.f64 fd95, fd86, fd86;
sub.f64 fd96, fd94, fd95;
mul.f64 fd97, fd86, fd85;
fma.rn.f64 fd98, fd86, fd85, fd97;
mul.f64 fd99, fd80, fd98;
fma.rn.f64 fd100, fd96, fd79, fd99;
mul.f64 fd101, fd79, fd98;
mul.f64 fd102, fd96, fd80;
sub.f64 fd103, fd102, fd101;
ld.global.v2.f64 {fd104, fd105}, [rd8+32];
mul.f64 fd108, fd84, fd105;
fma.rn.f64 fd109, fd104, fd83, fd108;
mul.f64 fd110, fd83, fd105;
mul.f64 fd111, fd104, fd84;
sub.f64 fd112, fd111, fd110;
shl.b32 r16, r5, 3;
and.b32 r17, r16, 24;
add.s32 r18, r10, r17;
barrier.sync 0;
and.b32 r19, r8, 128;
add.s32 r20, r18, r19;
st.shared.f64 [r20], fd77;
st.shared.f64 [r20+32], fd90;
st.shared.f64 [r20+64], fd100;
st.shared.f64 [r20+96], fd109;
barrier.sync 0;
mad.lo.s32 r21, r14, -24, r20;
ld.shared.f64 fd113, [r21];
ld.shared.f64 fd114, [r21+64];
ld.shared.f64 fd115, [r21+128];
ld.shared.f64 fd116, [r21+192];
barrier.sync 0;
st.shared.f64 [r20], fd78;
st.shared.f64 [r20+32], fd93;
st.shared.f64 [r20+64], fd103;
st.shared.f64 [r20+96], fd112;
barrier.sync 0;
ld.shared.f64 fd117, [r21];
ld.shared.f64 fd118, [r21+64];
ld.shared.f64 fd119, [r21+128];
ld.shared.f64 fd120, [r21+192];
add.f64 %0, fd113, fd115;
add.f64 %1, fd117, fd119;
add.f64 %2, fd114, fd116;
add.f64 %3, fd118, fd120;
sub.f64 %4, fd113, fd115;
sub.f64 %5, fd117, fd119;
sub.f64 %6, fd114, fd116;
sub.f64 %7, fd118, fd120;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y));
};




// Specialization 611 of cufftdx_private_function for double precision, implemented
// as one inline-PTX statement. (The include guard names this file FFT_32_FP64_INV,
// so this is presumably one variant of a 32-point inverse FFT - TODO confirm.)
//
// rmem : sixteen complex<double> values per thread; rmem[0..15].x/.y are passed
//        both as inputs and as outputs, so the array is overwritten with the result.
// smem : 32-bit value used as the base address of all st.shared/ld.shared accesses.
//
// Operand map: %0-%31 = outputs rmem[0..15].x/.y, %32 = smem, %33 = lut_dp_16_32,
// %34-%65 = inputs rmem[0..15].x/.y in order (rmem[k].x = %(34+2k),
// rmem[k].y = %(35+2k)), %66-%75 = repeated copies of the .y components of
// rmem[8], [4], [10], [2], [14], [1], [5], [13], [11], [7] respectively.
//
// Outline of the PTX:
//  1. Add/sub butterfly network over the sixteen inputs, using the constants
//     0d3FE6A09E667F3BCD (0.70710678..., sqrt(2)/2), 0d3FED906BCF328D46
//     (0.92387953..., cos(pi/8)) and 0d3FD87DE2A6AEA963 (0.38268343..., sin(pi/8))
//     and their negations.
//  2. Two complex entries are loaded from lut_dp_16_32 at byte offset
//     16*(tid.x & 1) and at +32. Powers of the first entry and products of those
//     powers with the second entry are built by repeated complex multiplication.
//     Fifteen of the sixteen butterfly results are multiplied by the conjugate of
//     these factors; the sum term (fd357/fd356) is stored unscaled.
//  3. One exchange through shared memory with real and imaginary parts kept
//     together: sixteen complex v2 stores at r12 (16-byte stride), then sixteen
//     complex v2 loads from r13 (32-byte stride). The asm contains two
//     `barrier.sync 0` instructions, one before the stores and one before the loads.
//  4. Final pairwise add/sub writes %0-%31.
//
// Shared-memory addressing:
//   r4  = smem + (tid.y << 9)
//   r9  = r4  + ((tid.x << 8) & -512)
//   r12 = r9  + ((tid.x << 8) & 256)      (store address)
//   r13 = r12 - 240 * (tid.x & 1)         (load address)
//
// Generated-code quirks visible below (left untouched): %tid.x is re-read into
// r17/r19/r21/r23 and masked into r16/r18/r20/r22, but only r22 is consumed (by the
// r13 computation); r14 is likewise never read. fd514/fd586 and fd513/fd584 compute
// the same differences twice, and fd300 is computed but not used afterwards.
// NOTE(review): the tid.y term and the (tid.x << 8) & -512 term both advance in
// 512-byte steps, so this presumably assumes blockDim.x == 2 - TODO confirm.
// NOTE(review): the asm statement has no clobber list (in particular no "memory"
// clobber) although it stores to and loads from shared memory - confirm callers do
// not rely on the compiler ordering ordinary memory accesses around it.
template<> __forceinline__ __device__ void cufftdx_private_function<611, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

// Single generated PTX block; register names (r*, fd*, rd*) are local to the braces.
asm volatile (R"({
.reg .b32 r<24>;
.reg .f64 fd<587>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 9;
mov.u32 r3, %32;
add.s32 r4, r3, r2;
add.f64 fd65, %34, %50;
sub.f64 fd67, %34, %50;
add.f64 fd581, %35, %66;
sub.f64 fd68, %35, %66;
add.f64 fd69, %42, %58;
sub.f64 fd71, %42, %58;
add.f64 fd579, %67, %59;
sub.f64 fd72, %67, %59;
add.f64 fd73, fd65, fd69;
sub.f64 fd75, fd65, fd69;
add.f64 fd578, fd581, fd579;
sub.f64 fd76, fd581, fd579;
sub.f64 fd77, fd67, fd72;
add.f64 fd79, fd67, fd72;
add.f64 fd577, fd68, fd71;
sub.f64 fd80, fd68, fd71;
add.f64 fd81, %38, %54;
sub.f64 fd83, %38, %54;
add.f64 fd574, %69, %68;
sub.f64 fd84, %69, %68;
add.f64 fd85, %46, %62;
sub.f64 fd87, %46, %62;
add.f64 fd572, %47, %70;
sub.f64 fd88, %47, %70;
add.f64 fd89, fd81, fd85;
sub.f64 fd91, fd81, fd85;
add.f64 fd571, fd574, fd572;
sub.f64 fd92, fd574, fd572;
sub.f64 fd93, fd83, fd88;
add.f64 fd95, fd83, fd88;
add.f64 fd570, fd84, fd87;
sub.f64 fd96, fd84, fd87;
mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD;
mul.f64 fd98, fd570, 0d3FE6A09E667F3BCD;
sub.f64 fd99, fd97, fd98;
add.f64 fd100, fd97, fd98;
mul.f64 fd568, fd95, 0dBFE6A09E667F3BCD;
mul.f64 fd569, fd96, 0d3FE6A09E667F3BCD;
sub.f64 fd103, fd568, fd569;
mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104;
add.f64 fd106, fd73, fd89;
sub.f64 fd108, fd73, fd89;
add.f64 fd567, fd578, fd571;
sub.f64 fd109, fd578, fd571;
add.f64 fd110, fd77, fd99;
sub.f64 fd112, fd77, fd99;
add.f64 fd566, fd577, fd100;
sub.f64 fd113, fd577, fd100;
sub.f64 fd114, fd75, fd92;
add.f64 fd116, fd75, fd92;
add.f64 fd565, fd76, fd91;
sub.f64 fd117, fd76, fd91;
add.f64 fd118, fd79, fd103;
sub.f64 fd120, fd79, fd103;
add.f64 fd564, fd80, fd105;
sub.f64 fd121, fd80, fd105;
add.f64 fd122, %36, %52;
sub.f64 fd124, %36, %52;
add.f64 fd562, %71, %53;
sub.f64 fd125, %71, %53;
add.f64 fd126, %44, %60;
sub.f64 fd128, %44, %60;
add.f64 fd559, %72, %73;
sub.f64 fd129, %72, %73;
add.f64 fd130, fd122, fd126;
sub.f64 fd132, fd122, fd126;
add.f64 fd558, fd562, fd559;
sub.f64 fd133, fd562, fd559;
sub.f64 fd134, fd124, fd129;
add.f64 fd136, fd124, fd129;
add.f64 fd557, fd125, fd128;
sub.f64 fd137, fd125, fd128;
add.f64 fd138, %40, %56;
sub.f64 fd140, %40, %56;
add.f64 fd555, %41, %74;
sub.f64 fd141, %41, %74;
add.f64 fd142, %48, %64;
sub.f64 fd144, %48, %64;
add.f64 fd553, %75, %65;
sub.f64 fd145, %75, %65;
add.f64 fd146, fd138, fd142;
sub.f64 fd148, fd138, fd142;
add.f64 fd552, fd555, fd553;
sub.f64 fd149, fd555, fd553;
sub.f64 fd150, fd140, fd145;
add.f64 fd152, fd140, fd145;
add.f64 fd551, fd141, fd144;
sub.f64 fd153, fd141, fd144;
mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD;
mul.f64 fd155, fd551, 0d3FE6A09E667F3BCD;
sub.f64 fd156, fd154, fd155;
add.f64 fd157, fd154, fd155;
mul.f64 fd549, fd152, 0dBFE6A09E667F3BCD;
mul.f64 fd550, fd153, 0d3FE6A09E667F3BCD;
sub.f64 fd160, fd549, fd550;
mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161;
add.f64 fd163, fd130, fd146;
sub.f64 fd165, fd130, fd146;
add.f64 fd548, fd558, fd552;
sub.f64 fd166, fd558, fd552;
add.f64 fd167, fd134, fd156;
sub.f64 fd169, fd134, fd156;
add.f64 fd547, fd557, fd157;
sub.f64 fd170, fd557, fd157;
sub.f64 fd171, fd132, fd149;
add.f64 fd173, fd132, fd149;
add.f64 fd546, fd133, fd148;
sub.f64 fd174, fd133, fd148;
add.f64 fd175, fd136, fd160;
sub.f64 fd177, fd136, fd160;
add.f64 fd545, fd137, fd162;
sub.f64 fd178, fd137, fd162;
mul.f64 fd543, fd167, 0d3FED906BCF328D46;
mul.f64 fd544, fd547, 0d3FD87DE2A6AEA963;
sub.f64 fd181, fd543, fd544;
mul.f64 fd182, fd547, 0d3FED906BCF328D46;
fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182;
mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD;
mul.f64 fd185, fd546, 0d3FE6A09E667F3BCD;
sub.f64 fd186, fd184, fd185;
add.f64 fd187, fd184, fd185;
mul.f64 fd189, fd545, 0d3FED906BCF328D46;
mul.f64 fd542, fd175, 0d3FD87DE2A6AEA963;
sub.f64 fd190, fd542, fd189;
mul.f64 fd191, fd545, 0d3FD87DE2A6AEA963;
fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191;
mul.f64 fd194, fd170, 0d3FED906BCF328D46;
mul.f64 fd541, fd169, 0dBFD87DE2A6AEA963;
sub.f64 fd195, fd541, fd194;
mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963;
fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196;
mul.f64 fd539, fd173, 0dBFE6A09E667F3BCD;
mul.f64 fd540, fd174, 0d3FE6A09E667F3BCD;
sub.f64 fd200, fd539, fd540;
mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201;
mul.f64 fd537, fd177, 0dBFED906BCF328D46;
mul.f64 fd538, fd178, 0d3FD87DE2A6AEA963;
sub.f64 fd205, fd537, fd538;
mul.f64 fd206, fd178, 0dBFED906BCF328D46;
fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206;
add.f64 fd210, fd110, fd181;
sub.f64 fd212, fd110, fd181;
add.f64 fd536, fd566, fd183;
sub.f64 fd213, fd566, fd183;
add.f64 fd214, fd114, fd186;
sub.f64 fd216, fd114, fd186;
add.f64 fd535, fd565, fd187;
sub.f64 fd217, fd565, fd187;
add.f64 fd218, fd118, fd190;
sub.f64 fd220, fd118, fd190;
add.f64 fd534, fd564, fd192;
sub.f64 fd221, fd564, fd192;
sub.f64 fd222, fd108, fd166;
add.f64 fd224, fd108, fd166;
add.f64 fd533, fd109, fd165;
sub.f64 fd225, fd109, fd165;
add.f64 fd226, fd112, fd195;
sub.f64 fd228, fd112, fd195;
add.f64 fd532, fd113, fd197;
sub.f64 fd229, fd113, fd197;
add.f64 fd230, fd116, fd200;
sub.f64 fd232, fd116, fd200;
add.f64 fd531, fd117, fd202;
sub.f64 fd233, fd117, fd202;
add.f64 fd234, fd120, fd205;
sub.f64 fd236, fd120, fd205;
add.f64 fd530, fd121, fd207;
sub.f64 fd237, fd121, fd207;
mov.u32 r15, %tid.x;
shl.b32 r7, r15, 8;
and.b32 r8, r7, -512;
add.s32 r9, r4, r8;
and.b32 r14, r15, 1;
shl.b32 r10, r15, 4;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 16;
mov.u64 rd4, %33;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd238, fd239}, [rd5];
mul.f64 fd242, fd536, fd239;
mul.f64 fd244, fd238, fd536;
mul.f64 fd246, fd239, fd239;
mul.f64 fd529, fd238, fd238;
sub.f64 fd247, fd529, fd246;
mul.f64 fd248, fd239, fd238;
fma.rn.f64 fd249, fd239, fd238, fd248;
mul.f64 fd250, fd535, fd249;
mul.f64 fd252, fd247, fd535;
mul.f64 fd527, fd238, fd247;
mul.f64 fd528, fd239, fd249;
sub.f64 fd255, fd527, fd528;
mul.f64 fd526, fd214, fd249;
mul.f64 fd256, fd238, fd249;
fma.rn.f64 fd257, fd239, fd247, fd256;
mul.f64 fd258, fd534, fd257;
mul.f64 fd260, fd255, fd534;
mul.f64 fd262, fd239, fd257;
mul.f64 fd525, fd238, fd255;
sub.f64 fd263, fd525, fd262;
mul.f64 fd524, fd218, fd257;
mul.f64 fd264, fd238, fd257;
fma.rn.f64 fd265, fd239, fd255, fd264;
mul.f64 fd266, fd533, fd265;
mul.f64 fd268, fd263, fd533;
mul.f64 fd270, fd239, fd265;
mul.f64 fd523, fd238, fd263;
sub.f64 fd271, fd523, fd270;
mul.f64 fd522, fd222, fd265;
mul.f64 fd272, fd238, fd265;
fma.rn.f64 fd273, fd239, fd263, fd272;
mul.f64 fd274, fd532, fd273;
mul.f64 fd276, fd271, fd532;
mul.f64 fd520, fd238, fd271;
mul.f64 fd521, fd239, fd273;
sub.f64 fd279, fd520, fd521;
mul.f64 fd519, fd226, fd273;
mul.f64 fd280, fd238, fd273;
fma.rn.f64 fd281, fd239, fd271, fd280;
mul.f64 fd282, fd531, fd281;
mul.f64 fd284, fd279, fd531;
mul.f64 fd286, fd239, fd281;
mul.f64 fd518, fd238, fd279;
sub.f64 fd287, fd518, fd286;
mul.f64 fd517, fd230, fd281;
mul.f64 fd288, fd238, fd281;
fma.rn.f64 fd289, fd239, fd279, fd288;
mul.f64 fd290, fd530, fd289;
mul.f64 fd292, fd287, fd530;
mul.f64 fd294, fd239, fd289;
mul.f64 fd516, fd238, fd287;
sub.f64 fd295, fd516, fd294;
mul.f64 fd515, fd234, fd289;
mul.f64 fd296, fd238, fd289;
fma.rn.f64 fd297, fd239, fd287, fd296;
sub.f64 fd514, fd567, fd548;
mul.f64 fd298, fd514, fd297;
sub.f64 fd513, fd106, fd163;
mul.f64 fd299, fd513, fd297;
mul.f64 fd300, fd295, fd514;
ld.global.v2.f64 {fd301, fd302}, [rd5+32];
mul.f64 fd305, fd213, fd302;
mul.f64 fd307, fd301, fd213;
mul.f64 fd511, fd238, fd301;
mul.f64 fd512, fd239, fd302;
sub.f64 fd310, fd511, fd512;
mul.f64 fd510, fd212, fd302;
mul.f64 fd311, fd238, fd302;
fma.rn.f64 fd312, fd239, fd301, fd311;
mul.f64 fd313, fd217, fd312;
mul.f64 fd315, fd310, fd217;
mul.f64 fd317, fd239, fd312;
mul.f64 fd509, fd238, fd310;
sub.f64 fd318, fd509, fd317;
mul.f64 fd508, fd216, fd312;
mul.f64 fd319, fd238, fd312;
fma.rn.f64 fd320, fd239, fd310, fd319;
mul.f64 fd321, fd221, fd320;
mul.f64 fd323, fd318, fd221;
mul.f64 fd506, fd238, fd318;
mul.f64 fd507, fd239, fd320;
sub.f64 fd326, fd506, fd507;
mul.f64 fd505, fd220, fd320;
mul.f64 fd327, fd238, fd320;
fma.rn.f64 fd328, fd239, fd318, fd327;
mul.f64 fd329, fd225, fd328;
mul.f64 fd331, fd326, fd225;
mul.f64 fd503, fd238, fd326;
mul.f64 fd504, fd239, fd328;
sub.f64 fd334, fd503, fd504;
mul.f64 fd502, fd224, fd328;
mul.f64 fd335, fd238, fd328;
fma.rn.f64 fd336, fd239, fd326, fd335;
mul.f64 fd337, fd229, fd336;
mul.f64 fd339, fd334, fd229;
mul.f64 fd341, fd239, fd336;
mul.f64 fd501, fd238, fd334;
sub.f64 fd342, fd501, fd341;
mul.f64 fd500, fd228, fd336;
mul.f64 fd343, fd238, fd336;
fma.rn.f64 fd344, fd239, fd334, fd343;
mul.f64 fd345, fd233, fd344;
mul.f64 fd347, fd342, fd233;
mul.f64 fd498, fd238, fd342;
mul.f64 fd499, fd239, fd344;
sub.f64 fd350, fd498, fd499;
mul.f64 fd497, fd232, fd344;
mul.f64 fd351, fd238, fd344;
mul.f64 fd496, fd210, fd239;
fma.rn.f64 fd352, fd239, fd342, fd351;
mul.f64 fd353, fd237, fd352;
mul.f64 fd354, fd236, fd352;
mul.f64 fd355, fd350, fd237;
barrier.sync 0;
and.b32 r11, r7, 256;
add.s32 r12, r9, r11;
mov.u32 r17, %tid.x;
and.b32 r16, r17, 1;
sub.f64 fd586, fd567, fd548;
mul.f64 fd585, fd295, fd586;
add.f64 fd356, fd567, fd548;
mov.u32 r19, %tid.x;
and.b32 r18, r19, 1;
sub.f64 fd584, fd106, fd163;
add.f64 fd357, fd106, fd163;
st.shared.v2.f64 [r12], {fd357, fd356};
mov.u32 r21, %tid.x;
and.b32 r20, r21, 1;
mov.u32 r23, %tid.x;
and.b32 r22, r23, 1;
fma.rn.f64 fd358, fd238, fd210, fd242;
sub.f64 fd359, fd244, fd496;
st.shared.v2.f64 [r12+16], {fd358, fd359};
fma.rn.f64 fd360, fd247, fd214, fd250;
sub.f64 fd361, fd252, fd526;
st.shared.v2.f64 [r12+32], {fd360, fd361};
fma.rn.f64 fd362, fd255, fd218, fd258;
sub.f64 fd363, fd260, fd524;
st.shared.v2.f64 [r12+48], {fd362, fd363};
sub.f64 fd364, fd268, fd522;
fma.rn.f64 fd365, fd263, fd222, fd266;
st.shared.v2.f64 [r12+64], {fd365, fd364};
fma.rn.f64 fd366, fd271, fd226, fd274;
sub.f64 fd367, fd276, fd519;
st.shared.v2.f64 [r12+80], {fd366, fd367};
fma.rn.f64 fd368, fd279, fd230, fd282;
sub.f64 fd369, fd284, fd517;
st.shared.v2.f64 [r12+96], {fd368, fd369};
fma.rn.f64 fd370, fd287, fd234, fd290;
sub.f64 fd371, fd292, fd515;
st.shared.v2.f64 [r12+112], {fd370, fd371};
fma.rn.f64 fd372, fd295, fd584, fd298;
sub.f64 fd373, fd585, fd299;
st.shared.v2.f64 [r12+128], {fd372, fd373};
fma.rn.f64 fd374, fd301, fd212, fd305;
sub.f64 fd375, fd307, fd510;
st.shared.v2.f64 [r12+144], {fd374, fd375};
fma.rn.f64 fd376, fd310, fd216, fd313;
sub.f64 fd377, fd315, fd508;
st.shared.v2.f64 [r12+160], {fd376, fd377};
fma.rn.f64 fd378, fd318, fd220, fd321;
sub.f64 fd379, fd323, fd505;
st.shared.v2.f64 [r12+176], {fd378, fd379};
sub.f64 fd380, fd331, fd502;
fma.rn.f64 fd381, fd326, fd224, fd329;
st.shared.v2.f64 [r12+192], {fd381, fd380};
fma.rn.f64 fd382, fd334, fd228, fd337;
sub.f64 fd383, fd339, fd500;
st.shared.v2.f64 [r12+208], {fd382, fd383};
fma.rn.f64 fd384, fd342, fd232, fd345;
sub.f64 fd385, fd347, fd497;
st.shared.v2.f64 [r12+224], {fd384, fd385};
fma.rn.f64 fd386, fd350, fd236, fd353;
sub.f64 fd387, fd355, fd354;
st.shared.v2.f64 [r12+240], {fd386, fd387};
barrier.sync 0;
mad.lo.s32 r13, r22, -240, r12;
ld.shared.v2.f64 {fd388, fd389}, [r13];
ld.shared.v2.f64 {fd392, fd393}, [r13+32];
ld.shared.v2.f64 {fd396, fd397}, [r13+64];
ld.shared.v2.f64 {fd400, fd401}, [r13+96];
ld.shared.v2.f64 {fd404, fd405}, [r13+128];
ld.shared.v2.f64 {fd408, fd409}, [r13+160];
ld.shared.v2.f64 {fd412, fd413}, [r13+192];
ld.shared.v2.f64 {fd416, fd417}, [r13+224];
ld.shared.v2.f64 {fd420, fd421}, [r13+256];
ld.shared.v2.f64 {fd424, fd425}, [r13+288];
ld.shared.v2.f64 {fd428, fd429}, [r13+320];
ld.shared.v2.f64 {fd432, fd433}, [r13+352];
ld.shared.v2.f64 {fd436, fd437}, [r13+384];
ld.shared.v2.f64 {fd440, fd441}, [r13+416];
ld.shared.v2.f64 {fd444, fd445}, [r13+448];
ld.shared.v2.f64 {fd448, fd449}, [r13+480];
add.f64 %1, fd389, fd421;
add.f64 %0, fd388, fd420;
add.f64 %3, fd393, fd425;
add.f64 %2, fd392, fd424;
add.f64 %4, fd396, fd428;
add.f64 %5, fd397, fd429;
add.f64 %6, fd400, fd432;
add.f64 %7, fd401, fd433;
add.f64 %8, fd404, fd436;
add.f64 %9, fd405, fd437;
add.f64 %10, fd408, fd440;
add.f64 %11, fd409, fd441;
add.f64 %13, fd413, fd445;
add.f64 %12, fd412, fd444;
add.f64 %15, fd417, fd449;
add.f64 %14, fd416, fd448;
sub.f64 %17, fd389, fd421;
sub.f64 %16, fd388, fd420;
sub.f64 %19, fd393, fd425;
sub.f64 %18, fd392, fd424;
sub.f64 %21, fd397, fd429;
sub.f64 %20, fd396, fd428;
sub.f64 %23, fd401, fd433;
sub.f64 %22, fd400, fd432;
sub.f64 %25, fd405, fd437;
sub.f64 %24, fd404, fd436;
sub.f64 %27, fd409, fd441;
sub.f64 %26, fd408, fd440;
sub.f64 %29, fd413, fd445;
sub.f64 %28, fd412, fd444;
sub.f64 %31, fd417, fd449;
sub.f64 %30, fd416, fd448;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y));
};




// cufftdx_private_function<612, double, 1>
//
// Inline-PTX routine that transforms, in place, the two complex<double>
// values held by each thread (rmem[0], rmem[1]) while exchanging intermediate
// values with other threads through shared memory. The include guard of this
// header (CUFFTDX_FFT_32_FP64_INV_PTX_HPP) suggests this is one variant of a
// 32-point FP64 inverse FFT -- NOTE(review): confirm against the dispatcher
// that selects function id 612.
//
// Parameters:
//   rmem  in/out. rmem[0] and rmem[1] are read as inputs (%9..%12) and
//         overwritten with the results (%0..%3).
//   smem  32-bit shared-memory address of the exchange buffer (%4). The code
//         adds %tid.y * 512 and ((%tid.x << 5) & -512) to it and then works
//         inside that 512-byte window.
//
// What the PTX does:
//   * Four rounds. Each round forms the sum and the difference of two complex
//     values, multiplies the difference by a factor w fetched with ld.global
//     from a table (lut_dp_2_32, lut_dp_2_16, lut_dp_2_8, lut_dp_2_4, in that
//     order), stores sum and product to shared memory as v2.f64 {re, im}
//     pairs, and after a barrier reloads two values lying 256 bytes apart.
//     Round 1 works on the register inputs; rounds 2..4 work on the values
//     reloaded by the previous round.
//   * The table entry is selected by %tid.x bits [3:0], [3:1], [3:2] and [3]
//     for rounds 1..4 respectively (16 bytes per entry).
//   * The product is (w.x*d.x + w.y*d.y, w.x*d.y - w.y*d.x), i.e. the
//     difference d multiplied by the complex conjugate of w.
//   * A final add/sub of the last two reloaded values produces rmem[0] (sum)
//     and rmem[1] (difference).
//
// Synchronization: the body executes `barrier.sync 0` eight times, so it has
// to be reached uniformly by every thread that participates in barrier 0.
// NOTE(review): the asm statement declares no "memory" clobber even though it
// reads and writes shared memory; presumably callers do not access the same
// buffer around the call without their own synchronization -- confirm.
template<> __forceinline__ __device__ void cufftdx_private_function<612, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

asm volatile (R"({
.reg .b32 r<35>;
.reg .f64 fd<97>;
.reg .b64 rd<15>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 9;
mov.u32 r3, %4;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
sub.f64 fd9, %9, %11;
sub.f64 fd10, %10, %12;
shl.b32 r6, r5, 5;
and.b32 r7, r6, -512;
add.s32 r8, r4, r7;
shl.b32 r9, r5, 4;
cvt.u64.u32 rd2, r9;
and.b64 rd3, rd2, 240;
mov.u64 rd4, %5;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd11, fd12}, [rd5];
mul.f64 fd15, fd10, fd12;
mul.f64 fd16, fd9, fd12;
mul.f64 fd17, fd11, fd10;
barrier.sync 0;
and.b32 r10, r6, 480;
add.s32 r11, r8, r10;
add.f64 fd18, %10, %12;
add.f64 fd19, %9, %11;
st.shared.v2.f64 [r11], {fd19, fd18};
sub.f64 fd20, fd17, fd16;
fma.rn.f64 fd21, fd11, fd9, fd15;
st.shared.v2.f64 [r11+16], {fd21, fd20};
barrier.sync 0;
and.b32 r12, r9, 240;
sub.s32 r13, r11, r12;
ld.shared.v2.f64 {fd22, fd23}, [r13];
ld.shared.v2.f64 {fd26, fd27}, [r13+256];
sub.f64 fd30, fd22, fd26;
sub.f64 fd31, fd23, fd27;
bfe.u32 r14, r5, 1, 3;
mul.wide.u32 rd6, r14, 16;
mov.u64 rd7, %6;
add.s64 rd8, rd7, rd6;
ld.global.v2.f64 {fd32, fd33}, [rd8];
mul.f64 fd36, fd31, fd33;
mul.f64 fd37, fd30, fd33;
mul.f64 fd38, fd32, fd31;
and.b32 r15, r9, 16;
add.s32 r16, r8, r15;
barrier.sync 0;
and.b32 r17, r6, 448;
add.s32 r18, r16, r17;
add.f64 fd39, fd23, fd27;
add.f64 fd40, fd22, fd26;
st.shared.v2.f64 [r18], {fd40, fd39};
fma.rn.f64 fd41, fd32, fd30, fd36;
sub.f64 fd42, fd38, fd37;
st.shared.v2.f64 [r18+32], {fd41, fd42};
barrier.sync 0;
and.b32 r19, r9, 224;
sub.s32 r20, r18, r19;
ld.shared.v2.f64 {fd43, fd44}, [r20];
ld.shared.v2.f64 {fd47, fd48}, [r20+256];
sub.f64 fd51, fd43, fd47;
sub.f64 fd52, fd44, fd48;
bfe.u32 r21, r5, 2, 2;
mul.wide.u32 rd9, r21, 16;
mov.u64 rd10, %7;
add.s64 rd11, rd10, rd9;
ld.global.v2.f64 {fd53, fd54}, [rd11];
mul.f64 fd57, fd52, fd54;
mul.f64 fd58, fd51, fd54;
mul.f64 fd59, fd53, fd52;
and.b32 r22, r9, 48;
add.s32 r23, r8, r22;
barrier.sync 0;
and.b32 r24, r6, 384;
add.s32 r25, r23, r24;
add.f64 fd60, fd44, fd48;
add.f64 fd61, fd43, fd47;
st.shared.v2.f64 [r25], {fd61, fd60};
fma.rn.f64 fd62, fd53, fd51, fd57;
sub.f64 fd63, fd59, fd58;
st.shared.v2.f64 [r25+64], {fd62, fd63};
barrier.sync 0;
and.b32 r26, r9, 192;
sub.s32 r27, r25, r26;
ld.shared.v2.f64 {fd64, fd65}, [r27];
ld.shared.v2.f64 {fd68, fd69}, [r27+256];
sub.f64 fd72, fd64, fd68;
sub.f64 fd73, fd65, fd69;
bfe.u32 r28, r5, 3, 1;
mul.wide.u32 rd12, r28, 16;
mov.u64 rd13, %8;
add.s64 rd14, rd13, rd12;
ld.global.v2.f64 {fd74, fd75}, [rd14];
mul.f64 fd78, fd73, fd75;
mul.f64 fd79, fd72, fd75;
mul.f64 fd80, fd74, fd73;
and.b32 r29, r9, 112;
add.s32 r30, r8, r29;
barrier.sync 0;
and.b32 r31, r6, 256;
add.s32 r32, r30, r31;
add.f64 fd81, fd65, fd69;
add.f64 fd82, fd64, fd68;
st.shared.v2.f64 [r32], {fd82, fd81};
fma.rn.f64 fd83, fd74, fd72, fd78;
sub.f64 fd84, fd80, fd79;
st.shared.v2.f64 [r32+128], {fd83, fd84};
barrier.sync 0;
and.b32 r33, r9, 128;
sub.s32 r34, r32, r33;
ld.shared.v2.f64 {fd85, fd86}, [r34];
ld.shared.v2.f64 {fd89, fd90}, [r34+256];
add.f64 %1, fd86, fd90;
add.f64 %0, fd85, fd89;
sub.f64 %3, fd86, fd90;
sub.f64 %2, fd85, fd89;
})"
     // Operand map: %0-%3 = outputs rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y;
     // %4 = smem; %5-%8 = lut_dp_2_32, lut_dp_2_16, lut_dp_2_8, lut_dp_2_4;
     // %9-%12 = inputs rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y.
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y));
};




// cufftdx_private_function<613, double, 1>
//
// Inline-PTX routine that transforms, in place, the two complex<double>
// values held by each thread (rmem[0], rmem[1]) while exchanging intermediate
// values with other threads through shared memory. The include guard of this
// header (CUFFTDX_FFT_32_FP64_INV_PTX_HPP) suggests this is one variant of a
// 32-point FP64 inverse FFT -- NOTE(review): confirm against the dispatcher
// that selects function id 613.
//
// Parameters:
//   rmem  in/out. rmem[0] and rmem[1] are read as inputs (%9..%12) and
//         overwritten with the results (%0..%3).
//   smem  32-bit shared-memory address of the exchange buffer (%4). The code
//         adds %tid.y * 256 and ((%tid.x << 4) & -256) to it and then works
//         inside that 256-byte window.
//
// What the PTX does:
//   * Four rounds. Each round forms the sum and the difference of two complex
//     values and multiplies the difference by a factor w fetched with
//     ld.global from a table (lut_dp_2_32, lut_dp_2_16, lut_dp_2_8,
//     lut_dp_2_4, in that order). The table entry is selected by %tid.x bits
//     [3:0], [3:1], [3:2] and [3] respectively (16 bytes per entry).
//   * The product is (w.x*d.x + w.y*d.y, w.x*d.y - w.y*d.x), i.e. the
//     difference d multiplied by the complex conjugate of w.
//   * Real and imaginary parts are exchanged in two separate passes over the
//     same buffer: the real parts of sum and product are stored, a barrier is
//     executed, two doubles 128 bytes apart are loaded, another barrier is
//     executed, and the same store/barrier/load sequence is repeated for the
//     imaginary parts.
//   * A final add/sub of the values loaded in round 4 produces rmem[0] (sums)
//     and rmem[1] (differences).
//
// Synchronization: the body executes `barrier.sync 0` sixteen times, so it
// has to be reached uniformly by every thread that participates in barrier 0.
// NOTE(review): the asm statement declares no "memory" clobber even though it
// reads and writes shared memory; presumably callers do not access the same
// buffer around the call without their own synchronization -- confirm.
template<> __forceinline__ __device__ void cufftdx_private_function<613, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

asm volatile (R"({
.reg .b32 r<35>;
.reg .f64 fd<81>;
.reg .b64 rd<15>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 8;
mov.u32 r3, %4;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f64 fd9, %9, %11;
add.f64 fd10, %10, %12;
sub.f64 fd11, %9, %11;
sub.f64 fd12, %10, %12;
shl.b32 r6, r5, 4;
cvt.u64.u32 rd2, r6;
and.b64 rd3, rd2, 240;
mov.u64 rd4, %5;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd13, fd14}, [rd5];
mul.f64 fd17, fd12, fd14;
fma.rn.f64 fd18, fd13, fd11, fd17;
mul.f64 fd19, fd11, fd14;
mul.f64 fd20, fd13, fd12;
sub.f64 fd21, fd20, fd19;
and.b32 r7, r6, -256;
add.s32 r8, r4, r7;
barrier.sync 0;
and.b32 r9, r6, 240;
add.s32 r10, r8, r9;
st.shared.v2.f64 [r10], {fd9, fd18};
barrier.sync 0;
shl.b32 r11, r5, 3;
and.b32 r12, r11, 120;
sub.s32 r13, r10, r12;
ld.shared.f64 fd22, [r13];
ld.shared.f64 fd23, [r13+128];
barrier.sync 0;
st.shared.v2.f64 [r10], {fd10, fd21};
barrier.sync 0;
ld.shared.f64 fd24, [r13];
ld.shared.f64 fd25, [r13+128];
add.f64 fd26, fd22, fd23;
add.f64 fd27, fd24, fd25;
sub.f64 fd28, fd22, fd23;
sub.f64 fd29, fd24, fd25;
bfe.u32 r14, r5, 1, 3;
mul.wide.u32 rd6, r14, 16;
mov.u64 rd7, %6;
add.s64 rd8, rd7, rd6;
ld.global.v2.f64 {fd30, fd31}, [rd8];
mul.f64 fd34, fd29, fd31;
fma.rn.f64 fd35, fd30, fd28, fd34;
mul.f64 fd36, fd28, fd31;
mul.f64 fd37, fd30, fd29;
sub.f64 fd38, fd37, fd36;
and.b32 r15, r11, 8;
add.s32 r16, r8, r15;
barrier.sync 0;
and.b32 r17, r6, 224;
add.s32 r18, r16, r17;
st.shared.f64 [r18], fd26;
st.shared.f64 [r18+16], fd35;
barrier.sync 0;
and.b32 r19, r11, 112;
sub.s32 r20, r18, r19;
ld.shared.f64 fd39, [r20];
ld.shared.f64 fd40, [r20+128];
barrier.sync 0;
st.shared.f64 [r18], fd27;
st.shared.f64 [r18+16], fd38;
barrier.sync 0;
ld.shared.f64 fd41, [r20];
ld.shared.f64 fd42, [r20+128];
add.f64 fd43, fd39, fd40;
add.f64 fd44, fd41, fd42;
sub.f64 fd45, fd39, fd40;
sub.f64 fd46, fd41, fd42;
bfe.u32 r21, r5, 2, 2;
mul.wide.u32 rd9, r21, 16;
mov.u64 rd10, %7;
add.s64 rd11, rd10, rd9;
ld.global.v2.f64 {fd47, fd48}, [rd11];
mul.f64 fd51, fd46, fd48;
fma.rn.f64 fd52, fd47, fd45, fd51;
mul.f64 fd53, fd45, fd48;
mul.f64 fd54, fd47, fd46;
sub.f64 fd55, fd54, fd53;
and.b32 r22, r11, 24;
add.s32 r23, r8, r22;
barrier.sync 0;
and.b32 r24, r6, 192;
add.s32 r25, r23, r24;
st.shared.f64 [r25], fd43;
st.shared.f64 [r25+32], fd52;
barrier.sync 0;
and.b32 r26, r11, 96;
sub.s32 r27, r25, r26;
ld.shared.f64 fd56, [r27];
ld.shared.f64 fd57, [r27+128];
barrier.sync 0;
st.shared.f64 [r25], fd44;
st.shared.f64 [r25+32], fd55;
barrier.sync 0;
ld.shared.f64 fd58, [r27];
ld.shared.f64 fd59, [r27+128];
add.f64 fd60, fd56, fd57;
add.f64 fd61, fd58, fd59;
sub.f64 fd62, fd56, fd57;
sub.f64 fd63, fd58, fd59;
bfe.u32 r28, r5, 3, 1;
mul.wide.u32 rd12, r28, 16;
mov.u64 rd13, %8;
add.s64 rd14, rd13, rd12;
ld.global.v2.f64 {fd64, fd65}, [rd14];
mul.f64 fd68, fd63, fd65;
fma.rn.f64 fd69, fd64, fd62, fd68;
mul.f64 fd70, fd62, fd65;
mul.f64 fd71, fd64, fd63;
sub.f64 fd72, fd71, fd70;
and.b32 r29, r11, 56;
add.s32 r30, r8, r29;
barrier.sync 0;
and.b32 r31, r6, 128;
add.s32 r32, r30, r31;
st.shared.f64 [r32], fd60;
st.shared.f64 [r32+64], fd69;
barrier.sync 0;
and.b32 r33, r11, 64;
sub.s32 r34, r32, r33;
ld.shared.f64 fd73, [r34];
ld.shared.f64 fd74, [r34+128];
barrier.sync 0;
st.shared.f64 [r32], fd61;
st.shared.f64 [r32+64], fd72;
barrier.sync 0;
ld.shared.f64 fd75, [r34];
ld.shared.f64 fd76, [r34+128];
add.f64 %0, fd73, fd74;
add.f64 %1, fd75, fd76;
sub.f64 %2, fd73, fd74;
sub.f64 %3, fd75, fd76;
})"
     // Operand map: %0-%3 = outputs rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y;
     // %4 = smem; %5-%8 = lut_dp_2_32, lut_dp_2_16, lut_dp_2_8, lut_dp_2_4;
     // %9-%12 = inputs rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y.
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y));
};




// cufftdx_private_function<614, double, 1>
//
// Inline-PTX routine that transforms, in place, the sixteen complex<double>
// values held by each thread (rmem[0..15]), with one exchange through shared
// memory. The include guard of this header (CUFFTDX_FFT_32_FP64_INV_PTX_HPP)
// suggests this is one variant of a 32-point FP64 inverse FFT --
// NOTE(review): confirm against the dispatcher that selects function id 614.
//
// Parameters:
//   rmem  in/out. rmem[0..15] are read as inputs (%34..%75) and overwritten
//         with the results (%0..%31).
//   smem  32-bit shared-memory address of the exchange buffer (%32). The code
//         adds %tid.y * 256 and ((%tid.x << 7) & -256) to it and then works
//         inside that 256-byte window.
//
// What the PTX does:
//   1. Register-only phase: add/sub/mul/fma combinations of the sixteen
//      inputs using the literal constants 0d3FE6A09E667F3BCD (~0.70711),
//      0d3FED906BCF328D46 (~0.92388) and 0d3FD87DE2A6AEA963 (~0.38268) and
//      their negations, producing sixteen complex values (fd208..fd239).
//   2. Two factors are fetched with ld.global from lut_dp_16_32, at byte
//      offset (%tid.x & 1) * 16 and 32 bytes beyond that. Further factors are
//      derived by repeatedly multiplying by the first one. Fifteen of the
//      sixteen values are multiplied by the conjugate of their factor
//      (re = w.x*v.x + w.y*v.y, im = w.x*v.y - w.y*v.x); fd208/fd209 are
//      stored unmodified.
//   3. Exchange: each thread stores its sixteen real parts (128 bytes),
//      executes a barrier, and loads sixteen doubles spaced 16 bytes apart
//      starting at byte offset (%tid.x & 1) * 8 of the window. After another
//      barrier the same is done for the imaginary parts.
//   4. Loaded value j is combined with loaded value j+8: the sums go to
//      rmem[0..7] and the differences to rmem[8..15].
//
// Synchronization: the body executes `barrier.sync 0` four times, so it has
// to be reached uniformly by every thread that participates in barrier 0.
// NOTE(review): the asm statement declares no "memory" clobber even though it
// reads and writes shared memory; presumably callers do not access the same
// buffer around the call without their own synchronization -- confirm.
template<> __forceinline__ __device__ void cufftdx_private_function<614, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

asm volatile (R"({
.reg .b32 r<14>;
.reg .f64 fd<452>;
.reg .b64 rd<6>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 8;
mov.u32 r3, %32;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f64 fd65, %34, %55;
add.f64 fd66, %35, %57;
sub.f64 fd67, %34, %55;
sub.f64 fd68, %35, %57;
add.f64 fd69, %44, %66;
add.f64 fd70, %46, %67;
sub.f64 fd71, %44, %66;
sub.f64 fd72, %46, %67;
add.f64 fd73, fd65, fd69;
add.f64 fd74, fd66, fd70;
sub.f64 fd75, fd65, fd69;
sub.f64 fd76, fd66, fd70;
sub.f64 fd77, fd67, fd72;
add.f64 fd78, fd68, fd71;
add.f64 fd79, fd67, fd72;
sub.f64 fd80, fd68, fd71;
add.f64 fd81, %39, %60;
add.f64 fd82, %41, %62;
sub.f64 fd83, %39, %60;
sub.f64 fd84, %41, %62;
add.f64 fd85, %50, %71;
add.f64 fd86, %51, %73;
sub.f64 fd87, %50, %71;
sub.f64 fd88, %51, %73;
add.f64 fd89, fd81, fd85;
add.f64 fd90, fd82, fd86;
sub.f64 fd91, fd81, fd85;
sub.f64 fd92, fd82, fd86;
sub.f64 fd93, fd83, fd88;
add.f64 fd94, fd84, fd87;
add.f64 fd95, fd83, fd88;
sub.f64 fd96, fd84, fd87;
mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD;
mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD;
sub.f64 fd99, fd97, fd98;
add.f64 fd100, fd97, fd98;
mul.f64 fd101, fd95, 0dBFE6A09E667F3BCD;
mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD;
sub.f64 fd103, fd101, fd102;
mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104;
add.f64 fd106, fd73, fd89;
add.f64 fd107, fd74, fd90;
sub.f64 fd108, fd73, fd89;
sub.f64 fd109, fd74, fd90;
add.f64 fd110, fd77, fd99;
add.f64 fd111, fd78, fd100;
sub.f64 fd112, fd77, fd99;
sub.f64 fd113, fd78, fd100;
sub.f64 fd114, fd75, fd92;
add.f64 fd115, fd76, fd91;
add.f64 fd116, fd75, fd92;
sub.f64 fd117, fd76, fd91;
add.f64 fd118, fd79, fd103;
add.f64 fd119, fd80, fd105;
sub.f64 fd120, fd79, fd103;
sub.f64 fd121, fd80, fd105;
add.f64 fd122, %36, %58;
add.f64 fd123, %38, %59;
sub.f64 fd124, %36, %58;
sub.f64 fd125, %38, %59;
add.f64 fd126, %47, %68;
add.f64 fd127, %49, %70;
sub.f64 fd128, %47, %68;
sub.f64 fd129, %49, %70;
add.f64 fd130, fd122, fd126;
add.f64 fd131, fd123, fd127;
sub.f64 fd132, fd122, fd126;
sub.f64 fd133, fd123, fd127;
sub.f64 fd134, fd124, fd129;
add.f64 fd135, fd125, fd128;
add.f64 fd136, fd124, fd129;
sub.f64 fd137, fd125, fd128;
add.f64 fd138, %42, %63;
add.f64 fd139, %43, %65;
sub.f64 fd140, %42, %63;
sub.f64 fd141, %43, %65;
add.f64 fd142, %52, %74;
add.f64 fd143, %54, %75;
sub.f64 fd144, %52, %74;
sub.f64 fd145, %54, %75;
add.f64 fd146, fd138, fd142;
add.f64 fd147, fd139, fd143;
sub.f64 fd148, fd138, fd142;
sub.f64 fd149, fd139, fd143;
sub.f64 fd150, fd140, fd145;
add.f64 fd151, fd141, fd144;
add.f64 fd152, fd140, fd145;
sub.f64 fd153, fd141, fd144;
mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD;
mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD;
sub.f64 fd156, fd154, fd155;
add.f64 fd157, fd154, fd155;
mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD;
mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD;
sub.f64 fd160, fd158, fd159;
mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161;
add.f64 fd163, fd130, fd146;
add.f64 fd164, fd131, fd147;
sub.f64 fd165, fd130, fd146;
sub.f64 fd166, fd131, fd147;
add.f64 fd167, fd134, fd156;
add.f64 fd168, fd135, fd157;
sub.f64 fd169, fd134, fd156;
sub.f64 fd170, fd135, fd157;
sub.f64 fd171, fd132, fd149;
add.f64 fd172, fd133, fd148;
add.f64 fd173, fd132, fd149;
sub.f64 fd174, fd133, fd148;
add.f64 fd175, fd136, fd160;
add.f64 fd176, fd137, fd162;
sub.f64 fd177, fd136, fd160;
sub.f64 fd178, fd137, fd162;
mul.f64 fd179, fd167, 0d3FED906BCF328D46;
mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963;
sub.f64 fd181, fd179, fd180;
mul.f64 fd182, fd168, 0d3FED906BCF328D46;
fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182;
mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD;
mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD;
sub.f64 fd186, fd184, fd185;
add.f64 fd187, fd184, fd185;
mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963;
mul.f64 fd189, fd176, 0d3FED906BCF328D46;
sub.f64 fd190, fd188, fd189;
mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963;
fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191;
mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963;
mul.f64 fd194, fd170, 0d3FED906BCF328D46;
sub.f64 fd195, fd193, fd194;
mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963;
fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196;
mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD;
mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD;
sub.f64 fd200, fd198, fd199;
mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201;
mul.f64 fd203, fd177, 0dBFED906BCF328D46;
mul.f64 fd204, fd178, 0d3FD87DE2A6AEA963;
sub.f64 fd205, fd203, fd204;
mul.f64 fd206, fd178, 0dBFED906BCF328D46;
fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206;
add.f64 fd208, fd106, fd163;
add.f64 fd209, fd107, fd164;
sub.f64 fd210, fd106, fd163;
sub.f64 fd211, fd107, fd164;
add.f64 fd212, fd110, fd181;
add.f64 fd213, fd111, fd183;
sub.f64 fd214, fd110, fd181;
sub.f64 fd215, fd111, fd183;
add.f64 fd216, fd114, fd186;
add.f64 fd217, fd115, fd187;
sub.f64 fd218, fd114, fd186;
sub.f64 fd219, fd115, fd187;
add.f64 fd220, fd118, fd190;
add.f64 fd221, fd119, fd192;
sub.f64 fd222, fd118, fd190;
sub.f64 fd223, fd119, fd192;
sub.f64 fd224, fd108, fd166;
add.f64 fd225, fd109, fd165;
add.f64 fd226, fd108, fd166;
sub.f64 fd227, fd109, fd165;
add.f64 fd228, fd112, fd195;
add.f64 fd229, fd113, fd197;
sub.f64 fd230, fd112, fd195;
sub.f64 fd231, fd113, fd197;
add.f64 fd232, fd116, fd200;
add.f64 fd233, fd117, fd202;
sub.f64 fd234, fd116, fd200;
sub.f64 fd235, fd117, fd202;
add.f64 fd236, fd120, fd205;
add.f64 fd237, fd121, fd207;
sub.f64 fd238, fd120, fd205;
sub.f64 fd239, fd121, fd207;
and.b32 r6, r5, 1;
shl.b32 r7, r5, 4;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 16;
mov.u64 rd4, %33;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd240, fd241}, [rd5];
mul.f64 fd244, fd213, fd241;
fma.rn.f64 fd245, fd240, fd212, fd244;
mul.f64 fd246, fd212, fd241;
mul.f64 fd247, fd240, fd213;
sub.f64 fd248, fd247, fd246;
mul.f64 fd249, fd240, fd240;
mul.f64 fd250, fd241, fd241;
sub.f64 fd251, fd249, fd250;
mul.f64 fd252, fd241, fd240;
fma.rn.f64 fd253, fd241, fd240, fd252;
mul.f64 fd254, fd217, fd253;
fma.rn.f64 fd255, fd251, fd216, fd254;
mul.f64 fd256, fd216, fd253;
mul.f64 fd257, fd251, fd217;
sub.f64 fd258, fd257, fd256;
mul.f64 fd259, fd240, fd251;
mul.f64 fd260, fd241, fd253;
sub.f64 fd261, fd259, fd260;
mul.f64 fd262, fd240, fd253;
fma.rn.f64 fd263, fd241, fd251, fd262;
mul.f64 fd264, fd221, fd263;
fma.rn.f64 fd265, fd261, fd220, fd264;
mul.f64 fd266, fd220, fd263;
mul.f64 fd267, fd261, fd221;
sub.f64 fd268, fd267, fd266;
mul.f64 fd269, fd240, fd261;
mul.f64 fd270, fd241, fd263;
sub.f64 fd271, fd269, fd270;
mul.f64 fd272, fd240, fd263;
fma.rn.f64 fd273, fd241, fd261, fd272;
mul.f64 fd274, fd225, fd273;
fma.rn.f64 fd275, fd271, fd224, fd274;
mul.f64 fd276, fd224, fd273;
mul.f64 fd277, fd271, fd225;
sub.f64 fd278, fd277, fd276;
mul.f64 fd279, fd240, fd271;
mul.f64 fd280, fd241, fd273;
sub.f64 fd281, fd279, fd280;
mul.f64 fd282, fd240, fd273;
fma.rn.f64 fd283, fd241, fd271, fd282;
mul.f64 fd284, fd229, fd283;
fma.rn.f64 fd285, fd281, fd228, fd284;
mul.f64 fd286, fd228, fd283;
mul.f64 fd287, fd281, fd229;
sub.f64 fd288, fd287, fd286;
mul.f64 fd289, fd240, fd281;
mul.f64 fd290, fd241, fd283;
sub.f64 fd291, fd289, fd290;
mul.f64 fd292, fd240, fd283;
fma.rn.f64 fd293, fd241, fd281, fd292;
mul.f64 fd294, fd233, fd293;
fma.rn.f64 fd295, fd291, fd232, fd294;
mul.f64 fd296, fd232, fd293;
mul.f64 fd297, fd291, fd233;
sub.f64 fd298, fd297, fd296;
mul.f64 fd299, fd240, fd291;
mul.f64 fd300, fd241, fd293;
sub.f64 fd301, fd299, fd300;
mul.f64 fd302, fd240, fd293;
fma.rn.f64 fd303, fd241, fd291, fd302;
mul.f64 fd304, fd237, fd303;
fma.rn.f64 fd305, fd301, fd236, fd304;
mul.f64 fd306, fd236, fd303;
mul.f64 fd307, fd301, fd237;
sub.f64 fd308, fd307, fd306;
mul.f64 fd309, fd240, fd301;
mul.f64 fd310, fd241, fd303;
sub.f64 fd311, fd309, fd310;
mul.f64 fd312, fd240, fd303;
fma.rn.f64 fd313, fd241, fd301, fd312;
mul.f64 fd314, fd211, fd313;
fma.rn.f64 fd315, fd311, fd210, fd314;
mul.f64 fd316, fd210, fd313;
mul.f64 fd317, fd311, fd211;
sub.f64 fd318, fd317, fd316;
ld.global.v2.f64 {fd319, fd320}, [rd5+32];
mul.f64 fd323, fd215, fd320;
fma.rn.f64 fd324, fd319, fd214, fd323;
mul.f64 fd325, fd214, fd320;
mul.f64 fd326, fd319, fd215;
sub.f64 fd327, fd326, fd325;
mul.f64 fd328, fd240, fd319;
mul.f64 fd329, fd241, fd320;
sub.f64 fd330, fd328, fd329;
mul.f64 fd331, fd240, fd320;
fma.rn.f64 fd332, fd241, fd319, fd331;
mul.f64 fd333, fd219, fd332;
fma.rn.f64 fd334, fd330, fd218, fd333;
mul.f64 fd335, fd218, fd332;
mul.f64 fd336, fd330, fd219;
sub.f64 fd337, fd336, fd335;
mul.f64 fd338, fd240, fd330;
mul.f64 fd339, fd241, fd332;
sub.f64 fd340, fd338, fd339;
mul.f64 fd341, fd240, fd332;
fma.rn.f64 fd342, fd241, fd330, fd341;
mul.f64 fd343, fd223, fd342;
fma.rn.f64 fd344, fd340, fd222, fd343;
mul.f64 fd345, fd222, fd342;
mul.f64 fd346, fd340, fd223;
sub.f64 fd347, fd346, fd345;
mul.f64 fd348, fd240, fd340;
mul.f64 fd349, fd241, fd342;
sub.f64 fd350, fd348, fd349;
mul.f64 fd351, fd240, fd342;
fma.rn.f64 fd352, fd241, fd340, fd351;
mul.f64 fd353, fd227, fd352;
fma.rn.f64 fd354, fd350, fd226, fd353;
mul.f64 fd355, fd226, fd352;
mul.f64 fd356, fd350, fd227;
sub.f64 fd357, fd356, fd355;
mul.f64 fd358, fd240, fd350;
mul.f64 fd359, fd241, fd352;
sub.f64 fd360, fd358, fd359;
mul.f64 fd361, fd240, fd352;
fma.rn.f64 fd362, fd241, fd350, fd361;
mul.f64 fd363, fd231, fd362;
fma.rn.f64 fd364, fd360, fd230, fd363;
mul.f64 fd365, fd230, fd362;
mul.f64 fd366, fd360, fd231;
sub.f64 fd367, fd366, fd365;
mul.f64 fd368, fd240, fd360;
mul.f64 fd369, fd241, fd362;
sub.f64 fd370, fd368, fd369;
mul.f64 fd371, fd240, fd362;
fma.rn.f64 fd372, fd241, fd360, fd371;
mul.f64 fd373, fd235, fd372;
fma.rn.f64 fd374, fd370, fd234, fd373;
mul.f64 fd375, fd234, fd372;
mul.f64 fd376, fd370, fd235;
sub.f64 fd377, fd376, fd375;
mul.f64 fd378, fd240, fd370;
mul.f64 fd379, fd241, fd372;
sub.f64 fd380, fd378, fd379;
mul.f64 fd381, fd240, fd372;
fma.rn.f64 fd382, fd241, fd370, fd381;
mul.f64 fd383, fd239, fd382;
fma.rn.f64 fd384, fd380, fd238, fd383;
mul.f64 fd385, fd238, fd382;
mul.f64 fd386, fd380, fd239;
sub.f64 fd387, fd386, fd385;
shl.b32 r8, r5, 7;
and.b32 r9, r8, -256;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 128;
add.s32 r12, r10, r11;
st.shared.v2.f64 [r12], {fd208, fd245};
st.shared.v2.f64 [r12+16], {fd255, fd265};
st.shared.v2.f64 [r12+32], {fd275, fd285};
st.shared.v2.f64 [r12+48], {fd295, fd305};
st.shared.v2.f64 [r12+64], {fd315, fd324};
st.shared.v2.f64 [r12+80], {fd334, fd344};
st.shared.v2.f64 [r12+96], {fd354, fd364};
st.shared.v2.f64 [r12+112], {fd374, fd384};
barrier.sync 0;
mad.lo.s32 r13, r6, -120, r12;
ld.shared.f64 fd388, [r13];
ld.shared.f64 fd389, [r13+16];
ld.shared.f64 fd390, [r13+32];
ld.shared.f64 fd391, [r13+48];
ld.shared.f64 fd392, [r13+64];
ld.shared.f64 fd393, [r13+80];
ld.shared.f64 fd394, [r13+96];
ld.shared.f64 fd395, [r13+112];
ld.shared.f64 fd396, [r13+128];
ld.shared.f64 fd397, [r13+144];
ld.shared.f64 fd398, [r13+160];
ld.shared.f64 fd399, [r13+176];
ld.shared.f64 fd400, [r13+192];
ld.shared.f64 fd401, [r13+208];
ld.shared.f64 fd402, [r13+224];
ld.shared.f64 fd403, [r13+240];
barrier.sync 0;
st.shared.v2.f64 [r12], {fd209, fd248};
st.shared.v2.f64 [r12+16], {fd258, fd268};
st.shared.v2.f64 [r12+32], {fd278, fd288};
st.shared.v2.f64 [r12+48], {fd298, fd308};
st.shared.v2.f64 [r12+64], {fd318, fd327};
st.shared.v2.f64 [r12+80], {fd337, fd347};
st.shared.v2.f64 [r12+96], {fd357, fd367};
st.shared.v2.f64 [r12+112], {fd377, fd387};
barrier.sync 0;
ld.shared.f64 fd404, [r13];
ld.shared.f64 fd405, [r13+16];
ld.shared.f64 fd406, [r13+32];
ld.shared.f64 fd407, [r13+48];
ld.shared.f64 fd408, [r13+64];
ld.shared.f64 fd409, [r13+80];
ld.shared.f64 fd410, [r13+96];
ld.shared.f64 fd411, [r13+112];
ld.shared.f64 fd412, [r13+128];
ld.shared.f64 fd413, [r13+144];
ld.shared.f64 fd414, [r13+160];
ld.shared.f64 fd415, [r13+176];
ld.shared.f64 fd416, [r13+192];
ld.shared.f64 fd417, [r13+208];
ld.shared.f64 fd418, [r13+224];
ld.shared.f64 fd419, [r13+240];
add.f64 %0, fd388, fd396;
add.f64 %1, fd404, fd412;
add.f64 %2, fd389, fd397;
add.f64 %3, fd405, fd413;
add.f64 %4, fd390, fd398;
add.f64 %5, fd406, fd414;
add.f64 %6, fd391, fd399;
add.f64 %7, fd407, fd415;
add.f64 %8, fd392, fd400;
add.f64 %9, fd408, fd416;
add.f64 %10, fd393, fd401;
add.f64 %11, fd409, fd417;
add.f64 %12, fd394, fd402;
add.f64 %13, fd410, fd418;
add.f64 %14, fd395, fd403;
add.f64 %15, fd411, fd419;
sub.f64 %16, fd388, fd396;
sub.f64 %17, fd404, fd412;
sub.f64 %18, fd389, fd397;
sub.f64 %19, fd405, fd413;
sub.f64 %20, fd390, fd398;
sub.f64 %21, fd406, fd414;
sub.f64 %22, fd391, fd399;
sub.f64 %23, fd407, fd415;
sub.f64 %24, fd392, fd400;
sub.f64 %25, fd408, fd416;
sub.f64 %26, fd393, fd401;
sub.f64 %27, fd409, fd417;
sub.f64 %28, fd394, fd402;
sub.f64 %29, fd410, fd418;
sub.f64 %30, fd395, fd403;
sub.f64 %31, fd411, fd419;
})"
     // Operand map: %0-%31 = outputs rmem[0..15].{x,y}; %32 = smem;
     // %33 = lut_dp_16_32; %34-%75 = inputs. The input list repeats several
     // rmem[k].y operands, and the PTX above references only one operand of
     // each repeated pair (e.g. %38 but not %37). Do not remove the
     // duplicates: every later %N in the asm body depends on this numbering.
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y));
};


#endif
